library(vegawidget)
library(readr)
library(dplyr)
library(vegalite)
library(lubridate)
library(vegabrite)
library(tidyr)
library(jsonlite)Data Wrangling for Graphics
Exercise 1 (supply and demand)
supply_demand <- read_csv("https://calvin-data304.netlify.app/data/swd-lets-practice-ex-2-3.csv")
supply_demand |>
pivot_longer(demand:capacity, names_to = "type", values_to = "number")# A tibble: 24 × 3
date type number
<chr> <chr> <dbl>
1 2019-04 demand 46193
2 2019-04 capacity 29263
3 2019-05 demand 49131
4 2019-05 capacity 28037
5 2019-06 demand 50124
6 2019-06 capacity 21596
7 2019-07 demand 48850
8 2019-07 capacity 25895
9 2019-08 demand 47602
10 2019-08 capacity 25813
# ℹ 14 more rows
supply_demand2 <- supply_demand |>
mutate(unmet_demand = demand - capacity)
# pivot_longer(capacity:unmet_demand, names_to = "type", values_to = "number")vl_chart(supply_demand) |>
vl_fold(
c("capacity", "demand"),
as = c("type", "value")) |>
vl_mark_bar() |>
vl_encode_x("date:O") |>
vl_encode_xOffset("type:N") |>
# vl_facet_column("type:N") |>
vl_encode_y("value:Q") |>
vl_encode_color("type:N", scale = list(domain = c("demand", "capacity")))vl_chart(supply_demand) |>
vl_fold(
c("capacity", "demand"),
as = c("type", "value")) |>
vl_mark_line(point = TRUE) |>
vl_encode_x("date:T") |>
vl_encode_xOffset("type:N") |>
vl_encode_y("value:Q") |>
vl_encode_color("type:N", scale = list(domain = c("demand", "capacity")))demandplot <- vl_chart(supply_demand) |>
vl_mark_bar() |>
vl_encode_x("date:T") |>
vl_encode_y("demand:Q") |>
vl_encode_color(value = "red")
capacityplot <- vl_chart(supply_demand) |>
vl_mark_bar(width = 10) |>
vl_encode_x("date:T") |>
vl_encode_y("capacity:Q") |>
vl_encode_color(value = "blue") |>
vl_encode_opacity(value = 0.7)
vl_layer(demandplot, capacityplot)demandplot <- vl_chart(supply_demand) |>
vl_mark_bar() |>
vl_encode_x("date:T") |>
vl_encode_y("demand:Q") |>
vl_encode_color(value = "red", title = "unmet demands")Warning: Invalid schema for object passed to or created by
modify_inner_spec.vegaspec_unit
capacityplot <- vl_chart(supply_demand) |>
vl_mark_bar() |>
vl_encode_x("date:T") |>
vl_encode_y("capacity:Q") |>
vl_encode_color(value = "blue", title = "capacity") |>
vl_encode_opacity(value = 0.7)Warning: Invalid schema for object passed to or created by
modify_inner_spec.vegaspec_unit
vl_layer(demandplot, capacityplot)demandplot <- vl_chart(supply_demand) |>
vl_mark_point() |>
vl_encode_x("date:T") |>
vl_encode_y("demand:Q") |>
vl_encode_color(value = "red")
capacityplot <- vl_chart(supply_demand) |>
vl_mark_point() |>
vl_encode_x("date:T") |>
vl_encode_y("capacity:Q") |>
vl_encode_color(value = "blue")
lineplot <- vl_chart(supply_demand) |>
vl_fold(
c("capacity", "demand"),
as = c("type", "value")) |>
vl_mark_line() |>
vl_encode_x("date:T") |>
vl_encode_y("value:Q") |>
vl_encode_detail("date:T")
vl_layer(demandplot, capacityplot, lineplot)vl_chart(supply_demand2) |>
vl_mark_line(point = TRUE) |>
vl_encode_x("date:T") |>
vl_encode_y("unmet_demand:Q")Excercise 2 (jobs)
jobs_url <-
"https://cdn.jsdelivr.net/npm/vega-datasets@2.8.0/data/jobs.json"
#| fig-cap: "This is a caption for the figure."
vl_chart() |>
vl_mark_point() |>
vl_pivot("year", groupby = list("job", "sex"), value = "perc") |>
vl_encode_x("1850:Q") |>
vl_scale_x(type = "symlog", constant = 0.00001) |> # note: scale must be set quite small
vl_encode_y("2000:Q") |>
vl_scale_y(type = "symlog", constant = 0.00001) |> # note: scale must be set quite small
vl_facet_column("sex:N", title = "") |>
vl_encode_tooltip_array(list("job", "sex", "1850", "2000")) |>
vl_add_properties(
title = list(
text = "Percent of people working various jobs",
subtitle = "Each dot represents the one occupation. Hover to see which occupation it is."
)) |>
vl_add_data_url(jobs_url) “perc” means?
sum all values of perc in each year separated by men and women… explain what perc means
jobs_data <- fromJSON("https://cdn.jsdelivr.net/npm/vega-datasets@2.8.0/data/jobs.json")jobs_summary <- jobs_data %>%
group_by(year, sex) %>%
summarise(total_perc = sum(perc, na.rm = TRUE))`summarise()` has grouped output by 'year'. You can override using the
`.groups` argument.
glimpse(jobs_summary)Rows: 30
Columns: 3
Groups: year [15]
$ year <int> 1850, 1850, 1860, 1860, 1870, 1870, 1880, 1880, 1900, 1900,…
$ sex <chr> "men", "women", "men", "women", "men", "women", "men", "wom…
$ total_perc <dbl> 0.990373464, 0.009626536, 0.856455691, 0.143544309, 0.84560…
Looking at the data, the “perc” values add up to one for each year. This means that if we sum the percentages of men and women for the same year, the total will always be one. In other words, “perc” represents the proportion of jobs each year (where the total is 1, not 100%) for both men and women. For example, in 1850, 0.990373464 (or 99%) of workers were men, while 0.009626536 (or 1%) were women. This makes sense, as most women were likely housewives at that time.
modifying graphic
jobs_data_transformed <- jobs_data |>
pivot_wider(names_from = sex, values_from = perc, id_cols = c(job, year))
# View result
head(jobs_data_transformed)# A tibble: 6 × 4
job year men women
<chr> <int> <dbl> <dbl>
1 Accountant / Auditor 1850 0.000131 0
2 Accountant / Auditor 1860 0.000214 0
3 Accountant / Auditor 1870 0.0000998 0
4 Accountant / Auditor 1880 0.000125 0
5 Accountant / Auditor 1900 0.000396 0.0000272
6 Accountant / Auditor 1910 0 0
vl_chart(jobs_data_transformed) |>
vl_mark_point() |>
vl_filter("datum.year == 2000 | datum.year == 1990 | datum.year == 1980 | datum.year == 1970") |>
vl_facet("year:O", title = "", columns = 2) |>
vl_encode_x("men:Q") |>
vl_scale_x(type = "symlog", constant = 0.00001) |> # note: scale must be set quite small
vl_encode_y("women:Q") |>
vl_scale_y(type = "symlog", constant = 0.00001) |> # note: scale must be set quite small
vl_encode_color("job:N") |>
vl_encode_tooltip_array(list("job", "men", "women")) |>
vl_axis_x(format=".0%") |>
vl_axis_y(format=".0%") |>
vl_add_properties(
title = list(
text = "Percent of people working various jobs",
subtitle = "Each dot represents the one occupation. Hover to see which occupation it is."
))percentage of men and women who worked in various jobs
#join data because jobs_summary contain total % for men and women in each year
jobs_summary <-
jobs_summary |>
pivot_wider(names_from = sex, values_from = total_perc, id_cols = year) |>
rename_with(~ paste0(.x, "_total_perc"), -year)jobs_data_transformed_join <- left_join(jobs_data_transformed, jobs_summary, by = "year") |>
mutate(
perc_men = men / men_total_perc,
perc_women = women / women_total_perc
)vl_chart(jobs_data_transformed_join) |>
vl_mark_point() |>
vl_filter("datum.year == 2000 | datum.year == 1990 | datum.year == 1980 | datum.year == 1970") |>
vl_facet("year:O", title = "", columns = 2) |>
vl_encode_x("perc_men:Q") |>
vl_scale_x(type = "symlog", constant = 0.00001) |> # note: scale must be set quite small
vl_encode_y("perc_women:Q") |>
vl_scale_y(type = "symlog", constant = 0.00001) |> # note: scale must be set quite small
vl_encode_color("job:N") |>
vl_encode_tooltip_array(list("job", "perc_men", "perc_women")) |>
vl_axis_x(format=".0%") |>
vl_axis_y(format=".0%") |>
vl_add_properties(
title = list(
text = "Percent of people working various jobs in proportion to their gender",
subtitle = "Each dot represents the one occupation. Hover to see which occupation it is."
))#left join vegabrite
jobs_data_transformed |>
vl_lookup(
lookup = "year",
from = list(
data = list(jobs_data_transformed),
key = "year",
fields = list("total_perc")
)
)
Exercise 3 (price at the pump)
gas_prices <- read_csv("https://calvin-data304.netlify.app/data/pump_price_for_gasoline_us_per_liter.csv")
countries <- read_csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.csv")#giving both datasets common names
countries <- countries %>% rename(country = name)
joined_countries <- gas_prices %>% inner_join(countries, by = "country")
# Identify unmatched countries
unmatched_country_gas <- gas_prices %>% anti_join(countries, by = "country") # Countries in gas_prices but not in countries
unmatched_country_countries <- countries %>% anti_join(gas_prices, by = "country") # Countries in countries but not in gas_prices
# View results
print("Successfully Matched Data:")[1] "Successfully Matched Data:"
print(joined_countries)# A tibble: 154 × 37
country `1991` `1992` `1993` `1994` `1995` `1996` `1997` `1998` `1999` `2000`
<chr> <dbl> <dbl> <lgl> <lgl> <dbl> <lgl> <lgl> <dbl> <lgl> <dbl>
1 Afghan… NA NA NA NA NA NA NA NA NA NA
2 Angola NA NA NA NA NA NA NA 0.38 NA 0.3
3 Albania NA NA NA NA NA NA NA 0.86 NA 0.57
4 Andorra NA NA NA NA NA NA NA NA NA NA
5 Argent… NA 0.79 NA NA 0.6 NA NA 0.94 NA 1.07
6 Armenia NA NA NA NA NA NA NA 0.49 NA 0.55
7 Antigu… NA NA NA NA NA NA NA NA NA 0.56
8 Austra… NA NA NA NA NA NA NA 0.46 NA 0.57
9 Austria NA NA NA NA 1.15 NA NA 1.04 NA 0.82
10 Azerba… NA NA NA NA NA NA NA 0.46 NA 0.39
# ℹ 144 more rows
# ℹ 26 more variables: `2001` <lgl>, `2002` <dbl>, `2003` <lgl>, `2004` <dbl>,
# `2005` <lgl>, `2006` <dbl>, `2007` <lgl>, `2008` <dbl>, `2009` <lgl>,
# `2010` <dbl>, `2011` <lgl>, `2012` <dbl>, `2013` <lgl>, `2014` <dbl>,
# `2015` <lgl>, `2016` <chr>, `alpha-2` <chr>, `alpha-3` <chr>,
# `country-code` <chr>, `iso_3166-2` <chr>, region <chr>, `sub-region` <chr>,
# `intermediate-region` <chr>, `region-code` <chr>, …
print("Countries in gas_prices but NOT in countries:")[1] "Countries in gas_prices but NOT in countries:"
print(unmatched_country_gas$country) [1] "UAE" "Bolivia" "Brunei" "Cote d'Ivoire"
[5] "Congo, Dem. Rep." "Congo, Rep." "Cape Verde" "Czech Republic"
[9] "UK" "Hong Kong, China" "Iran" "Kyrgyz Republic"
[13] "South Korea" "Kosovo" "Lao" "St. Lucia"
[17] "Moldova" "Netherlands" "North Korea" "Palestine"
[21] "Russia" "Slovak Republic" "Syria" "Turkey"
[25] "Tanzania" "USA" "Venezuela" "Vietnam"
print("Countries in countries but NOT in gas_data:")[1] "Countries in countries but NOT in gas_data:"
print(unmatched_country_countries$country) [1] "Åland Islands"
[2] "American Samoa"
[3] "Anguilla"
[4] "Antarctica"
[5] "Aruba"
[6] "Bermuda"
[7] "Bolivia, Plurinational State of"
[8] "Bonaire, Sint Eustatius and Saba"
[9] "Bouvet Island"
[10] "British Indian Ocean Territory"
[11] "Brunei Darussalam"
[12] "Cabo Verde"
[13] "Cayman Islands"
[14] "Christmas Island"
[15] "Cocos (Keeling) Islands"
[16] "Comoros"
[17] "Congo"
[18] "Congo, Democratic Republic of the"
[19] "Cook Islands"
[20] "Côte d'Ivoire"
[21] "Curaçao"
[22] "Czechia"
[23] "Dominica"
[24] "Equatorial Guinea"
[25] "Falkland Islands (Malvinas)"
[26] "Faroe Islands"
[27] "French Guiana"
[28] "French Southern Territories"
[29] "Gibraltar"
[30] "Greenland"
[31] "Guadeloupe"
[32] "Guam"
[33] "Guernsey"
[34] "Guinea-Bissau"
[35] "Heard Island and McDonald Islands"
[36] "Holy See"
[37] "Hong Kong"
[38] "Iran, Islamic Republic of"
[39] "Isle of Man"
[40] "Jersey"
[41] "Kiribati"
[42] "Korea, Democratic People's Republic of"
[43] "Korea, Republic of"
[44] "Kyrgyzstan"
[45] "Lao People's Democratic Republic"
[46] "Macao"
[47] "Martinique"
[48] "Mayotte"
[49] "Micronesia, Federated States of"
[50] "Moldova, Republic of"
[51] "Montserrat"
[52] "Netherlands, Kingdom of the"
[53] "New Caledonia"
[54] "Niue"
[55] "Norfolk Island"
[56] "Northern Mariana Islands"
[57] "Palau"
[58] "Palestine, State of"
[59] "Pitcairn"
[60] "Puerto Rico"
[61] "Réunion"
[62] "Russian Federation"
[63] "Saint Barthélemy"
[64] "Saint Helena, Ascension and Tristan da Cunha"
[65] "Saint Kitts and Nevis"
[66] "Saint Lucia"
[67] "Saint Martin (French part)"
[68] "Saint Pierre and Miquelon"
[69] "Saint Vincent and the Grenadines"
[70] "San Marino"
[71] "Sao Tome and Principe"
[72] "Seychelles"
[73] "Sint Maarten (Dutch part)"
[74] "Slovakia"
[75] "Solomon Islands"
[76] "South Georgia and the South Sandwich Islands"
[77] "Svalbard and Jan Mayen"
[78] "Syrian Arab Republic"
[79] "Taiwan, Province of China"
[80] "Tanzania, United Republic of"
[81] "Tokelau"
[82] "Tonga"
[83] "Türkiye"
[84] "Turks and Caicos Islands"
[85] "Tuvalu"
[86] "United Arab Emirates"
[87] "United Kingdom of Great Britain and Northern Ireland"
[88] "United States of America"
[89] "United States Minor Outlying Islands"
[90] "Venezuela, Bolivarian Republic of"
[91] "Viet Nam"
[92] "Virgin Islands (British)"
[93] "Virgin Islands (U.S.)"
[94] "Wallis and Futuna"
[95] "Western Sahara"
country_gas_final <- joined_countries|>
pivot_longer(cols =
c("1991","1992","1993","1994","1995","1996", "1997","1998","1999", "2000","2001", "2002","2003", "2004","2005","2006", "2007", "2008","2009", "2010", "2011","2012", "2013", "2014", "2015", "2016"),
names_to = "year",
values_to = "gas_prices",
values_transform = list(gas_prices = as.numeric))#cleaned_data_countries <- joined_countries |> drop_na()
vl_chart(country_gas_final)|>
vl_mark_bar() |>
vl_encode_x("year:T") |>
vl_encode_y("gas_prices:Q", title = "gas prices (US $)")|>
vl_encode_column("sub-region:N", columns = 2)#my xOffset doesn't seem to be working in grouping the bars together per subregion...
vl_chart(country_gas_final, width = 300) |>
vl_mark_bar() |>
vl_filter("datum.year == 2000 | datum.year == 2002 | datum.year == 2004 | datum.year == 2006 | datum.year == 2008 | datum.year == 2010 | datum.year == 2012 | datum.year == 2014 |datum.year == 2016") |>
vl_encode_x("year:O", title = "Year") |>
vl_encode_xOffset("sub-region:N") |> # Group bars by sub-region
vl_encode_y("gas_prices:Q", title = "Gas Prices (US $)") |>
vl_facet("region:N", columns = 2) |> # Facet by region
vl_encode_color("country:N", title = "Country") |> # Color bars by country
vl_encode_tooltip_array(list("country", "gas_prices", "sub-region", "country-code"))